Market Rent Prediction¶

  • Appling a random forest model to identify significant factors and predict market rent
  • Using scikit learn for model fitting

Todo¶

  • Check more factors
  • Include date considerations
  • Create data visualizations
In [ ]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.io as pio
pio.renderers.default='notebook'

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
In [ ]:
raw_rental_df = pd.read_parquet('data/clean_housing_data.parquet')

rental_df = raw_rental_df.copy(deep= True)

# Only training/testing model on initial date
initial_date = rental_df['date'].iloc[0]
initial_rental_df = rental_df[rental_df['date'] == initial_date]

# initial_rental_df = initial_rental_df.set_index('id')
In [ ]:
initial_rental_df.columns
Out[ ]:
Index(['id', 'unit_code', 'property_code', 'market_id', 'market_name',
       'address_1', 'address_2', 'city', 'state', 'zipcode', 'country', 'beds',
       'baths', 'sqft', 'market_rent', 'lat', 'lng', 'available_at', 'unit_id',
       'unit_status', 'is_syndicated', 'is_syndicated_ils', 'is_on_special',
       'new_construction', 'est_rehab_complete_date', 'rehab_type',
       'submarket_names', 'subdivision', 'bid_type', 'asset_review_type',
       'days_on_market', 'formattedAddress', 'date', 'has_virtual_tour',
       'btr_community', 'model_home'],
      dtype='object')

Factor Importance Screening¶

In [ ]:
filtered_rental_df = initial_rental_df.loc[:, ['state', 'city', 'market_name', 'submarket_names', 'beds', 'baths', 'sqft', 'days_on_market', 'market_rent']]
filtered_rental_df.head()
Out[ ]:
state city market_name submarket_names beds baths sqft days_on_market market_rent
0 CA Los Angeles Southern California LA Metro West 3 1.0 1236 0 3299.0
1 CA Los Angeles Southern California LA Metro West 3 1.0 1422 9 3575.0
2 CA Los Angeles Southern California LA Metro West 2 3.0 1499 0 3999.0
3 CA Burbank Southern California San Fernando Valley East 3 2.0 1500 0 4399.0
4 CA North Hollywood Southern California San Fernando Valley East 3 2.0 1553 0 3899.0
In [ ]:
# Preprocessing adapted from: https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html

X_regressors = filtered_rental_df.drop(columns= 'market_rent')
y_response = filtered_rental_df['market_rent']

categorical_cols = make_column_selector(dtype_include=object)(X_regressors)
numerical_cols = make_column_selector(dtype_include=np.number)(X_regressors)
In [ ]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(handle_unknown= 'ignore'), categorical_cols),
    ('standard_scaler', StandardScaler(), numerical_cols)
])

model = Pipeline([
    ('preprocessor', preprocessor), 
    ('model', RandomForestRegressor(n_estimators=10, random_state=42))
])
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X_regressors, y_response, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error: {:.2f}".format(mse))
print("R-squared score: {:.2f}".format(r2))
Mean squared error: 66066.78
R-squared score: 0.73
In [ ]:
model_feature_names = model.named_steps['preprocessor'].get_feature_names_out()
model_feature_importance = model.named_steps['model'].feature_importances_

pd.DataFrame({
    'Feature': model_feature_names, 
    'Importance': model_feature_importance
}).sort_values('Importance', ascending= False).round(4).head(15)
Out[ ]:
Feature Importance
610 standard_scaler__sqft 0.2984
392 one-hot-encoder__market_name_South Florida/Miami 0.1812
1 one-hot-encoder__state_CA 0.1324
3 one-hot-encoder__state_FL 0.0428
391 one-hot-encoder__market_name_Seattle 0.0294
10 one-hot-encoder__state_WA 0.0154
609 standard_scaler__baths 0.0137
2 one-hot-encoder__state_CO 0.0133
608 standard_scaler__beds 0.0111
383 one-hot-encoder__market_name_Denver 0.0107
384 one-hot-encoder__market_name_Houston 0.0094
611 standard_scaler__days_on_market 0.0092
499 one-hot-encoder__submarket_names_Manatee 0.0088
545 one-hot-encoder__submarket_names_Port St. Lucie 0.0068
438 one-hot-encoder__submarket_names_Denver South 0.0064

Model Tuning¶

  • Including both market_name and state was redundant
  • city is too specific, submarket_names should be used instead
  • n_estimators improved performance but had diminishing returns, n_estimators=50 was used to reduce training/prediction time
In [ ]:
filtered_rental_df = initial_rental_df.loc[:, ['market_name', 'submarket_names', 'beds', 'baths', 'sqft', 'market_rent']]
filtered_rental_df.head()
Out[ ]:
market_name submarket_names beds baths sqft market_rent
0 Southern California LA Metro West 3 1.0 1236 3299.0
1 Southern California LA Metro West 3 1.0 1422 3575.0
2 Southern California LA Metro West 2 3.0 1499 3999.0
3 Southern California San Fernando Valley East 3 2.0 1500 4399.0
4 Southern California San Fernando Valley East 3 2.0 1553 3899.0
In [ ]:
X_regressors = filtered_rental_df.drop(columns= 'market_rent')
y_response = filtered_rental_df['market_rent']

categorical_cols = make_column_selector(dtype_include=object)(X_regressors)
numerical_cols = make_column_selector(dtype_include=np.number)(X_regressors)
In [ ]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(handle_unknown= 'ignore'), categorical_cols),
    ('standard_scaler', StandardScaler(), numerical_cols)
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=10, random_state=42))
])
In [ ]:
# Higher n_estimators had diminishing returns, increases run time
grid_search_n_estimators = GridSearchCV(model, {'model__n_estimators': [3, 5, 10, 50, 100, 150]}, cv=KFold(n_splits= 5, shuffle= True))

grid_search_n_estimators.fit(X_regressors, y_response)

print("Best hyperparameters:", grid_search_n_estimators.best_params_)
print("Best cross-validation score:", grid_search_n_estimators.best_score_)
Best hyperparameters: {'model__n_estimators': 150}
Best cross-validation score: 0.7727790744005573
In [ ]:
cv_n_estimators_results_df = pd.DataFrame({key: grid_search_n_estimators.cv_results_[key] for key in [
    'param_model__n_estimators', 
    'mean_fit_time',
    'std_fit_time',
    'mean_test_score', 
    'std_test_score']
})

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(
        name= 'mean_test_score',
        x= cv_n_estimators_results_df['param_model__n_estimators'],
        y= cv_n_estimators_results_df['mean_test_score'],
    ),
    secondary_y= False
)

fig.add_trace(
    go.Scatter(
        name= 'mean_fit_time',
        x= cv_n_estimators_results_df['param_model__n_estimators'],
        y= cv_n_estimators_results_df['mean_fit_time'],
    ),
    secondary_y= True
)

fig.update_layout(
    title= "<b>Diminishing Returns of n_estimators</b>",
    xaxis_title= "n_estimators",
)

fig.update_yaxes(
    title= "Mean Test Score",
    secondary_y= False
)
fig.update_yaxes(
    title= "Mean Fit Time",
    showgrid= False,
    secondary_y= True
)

fig.show()
In [ ]:
# KFold used instead of StratifiedKFold due to a few low n factors
grid_search = GridSearchCV(model, {
    'model__n_estimators': [5, 10, 50],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    }, 
    cv=KFold(n_splits= 5, shuffle= True)
)

grid_search.fit(X_regressors, y_response)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
Best hyperparameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 50}
Best cross-validation score: 0.7667670689664612
In [ ]:
# grid_search.cv_results_.keys()
cv_results_df = pd.DataFrame({key: grid_search.cv_results_[key] for key in [
    'param_model__max_depth', 
    'param_model__min_samples_leaf', 
    'param_model__min_samples_split', 
    'param_model__n_estimators', 
    'mean_fit_time',
    'std_fit_time',
    'mean_test_score', 
    'std_test_score']
})

cv_results_df.sort_values('mean_test_score', ascending= False).head(10)
Out[ ]:
param_model__max_depth param_model__min_samples_leaf param_model__min_samples_split param_model__n_estimators mean_fit_time std_fit_time mean_test_score std_test_score
5 None 1 5 50 1.174984 0.040583 0.766767 0.018953
8 None 1 10 50 0.963427 0.033524 0.764385 0.022908
2 None 1 2 50 1.533656 0.038395 0.762275 0.022913
14 None 2 5 50 0.979871 0.041501 0.762103 0.022704
17 None 2 10 50 0.843054 0.049420 0.760852 0.021686
11 None 2 2 50 1.052152 0.012300 0.759755 0.023039
7 None 1 10 10 0.209202 0.014839 0.750055 0.023464
4 None 1 5 10 0.230187 0.005783 0.749804 0.022449
16 None 2 10 10 0.165113 0.002566 0.748631 0.019231
1 None 1 2 10 0.333872 0.028559 0.747301 0.028411

Final Model¶

In [ ]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(min_samples_split= 5, n_estimators= 50, random_state=42))
])
In [ ]:
cv_results = cross_validate(model, X_regressors, y_response, cv= KFold(n_splits= 5, shuffle= True), scoring=('r2', 'neg_mean_squared_error'))

scores = cv_results["test_r2"]
print("The mean cross-validation r2 is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")
The mean cross-validation r2 is: 0.776 ± 0.013
In [ ]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        name= 'predictions',
        mode='markers',
        x= y_test,
        y= y_pred
    )
)

fig.add_trace(
    go.Scatter(
        name= 'actual',
        mode='lines',
        x= y_test,
        y= y_test
    )
)

fig.update_layout(
    title= "<b>Predicted vs Actual Market Rent</b>", 
    xaxis_title = "Actual Market Rent",
    yaxis_title = "Predicted Market Rent"
)


fig.show()